CSU_ESS330_Lab6

Author

Olivia Gilpin

Published

April 3, 2025

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
✔ broom        1.0.7     ✔ rsample      1.2.1
✔ dials        1.3.0     ✔ tune         1.2.1
✔ infer        1.0.7     ✔ workflows    1.1.4
✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
✔ parsnip      1.2.1     ✔ yardstick    1.3.2
✔ recipes      1.1.0     
── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter()   masks stats::filter()
✖ recipes::fixed()  masks stringr::fixed()
✖ dplyr::lag()      masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step()   masks stats::step()
• Use suppressPackageStartupMessages() to eliminate package startup messages
library(powerjoin)
library(glue)
library(vip)

Attaching package: 'vip'

The following object is masked from 'package:utils':

    vi
library(baguette)
library(ggpubr)
library(xgboost)

Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice
# Root URL for the CAMELS dataset files
root <- 'https://gdex.ucar.edu/dataset/camels/file'

# Ensure the local data directory exists; download.file() errors otherwise
dir.create('data', showWarnings = FALSE, recursive = TRUE)

# Documentation PDF describing each attribute (built from root, not hard-coded)
download.file(glue('{root}/camels_attributes_v2.0.pdf'),
              'data/camels_attributes_v2.0.pdf')

# Attribute tables to retrieve
types <- c("clim", "geol", "soil", "topo", "vege", "hydro")
# Where the files live online ...
remote_files  <- glue('{root}/camels_{types}.txt')
# where we want to download the data ...
local_files   <- glue('data/camels_{types}.txt')
walk2(remote_files, local_files, download.file, quiet = TRUE)

# Read each attribute table, then merge them all on the shared gauge id
camels <- map(local_files, read_delim, show_col_types = FALSE)
camels <- power_full_join(camels, by = 'gauge_id')

Question 1: Your Turn

# Map CAMELS gauge sites over US state outlines, colored by mean daily flow
ggplot(camels) +
  borders("state", colour = "gray50") +
  geom_point(aes(x = gauge_lon, y = gauge_lat, color = q_mean)) +
  scale_color_gradient(low = "pink", high = "dodgerblue") +
  ggthemes::theme_map()

Q1 Report what zero_q_freq represents: zero_q_freq represents the frequency of days with zero streamflow, expressed as a percentage of days, for each gauge location.

Question 2: Your Turn

# Pairwise correlations among aridity, precipitation, and streamflow
camels %>%
  select(aridity, p_mean, q_mean) %>%
  drop_na() %>%
  cor()
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000

a.

# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo "Runnoff"
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

Test transformation:

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo "Runnoff"
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) +
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo "Runnoff"
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

set.seed(123)
# Log-transform the outcome here; simple outcome transforms are bad form
# inside a recipe
camels <- mutate(camels, logQmean = log(q_mean))

# 80/20 train/test split
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

# 10-fold cross-validation folds from the training set
camels_cv <- vfold_cv(camels_train, v = 10)

Build a recipe

# Create a recipe to preprocess the data
# Preprocessing recipe: log the predictors, add their interaction, drop NAs.
# (Pipe operator made consistent: the original mixed %>% and |> in one chain.)
rec <- recipe(logQmean ~ aridity + p_mean, data = camels_train) |>
  # Log transform the predictor variables (aridity and p_mean)
  step_log(all_predictors()) |>
  # Add an interaction term between aridity and p_mean
  step_interact(terms = ~ aridity:p_mean) |>
  # Drop any rows with missing values in the predictors or outcome
  step_naomit(all_predictors(), all_outcomes())

Fitting a linear model to the data

# Prepare the data
baked_data <- prep(rec, camels_train) |> 
  bake(new_data = NULL)

# Interaction with lm
#  Base lm sets interaction terms with the * symbol
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16

Correct version: prep -> bake -> predict

# Apply the trained recipe to the test set, then predict with the base lm
test_data <- prep(rec) |> bake(new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)

Model Evaluation: statistical and visual

metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
# Observed vs predicted log mean flow on the test set, colored by aridity
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  geom_point() +
  geom_abline(linetype = 2) +
  # Diverging gradient color scale for aridity
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  theme_linedraw() +
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

Using a workflow instead

# Define model
# Linear regression model specification (lm engine, regression mode)
lm_model <- linear_reg() |>
  set_engine("lm") |>
  set_mode("regression")

# Bundle recipe + model into a workflow and fit on the training data
lm_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(lm_model) |>
  fit(data = camels_train)

# Extract the model coefficients from the fitted workflow
summary(extract_fit_engine(lm_wf))$coefficients
                   Estimate Std. Error    t value     Pr(>|t|)
(Intercept)      -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity          -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean            1.4843771 0.15511117   9.569762 4.022500e-20
aridity_x_p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
# From the base implementation
summary(lm_base)$coefficients
                 Estimate Std. Error    t value     Pr(>|t|)
(Intercept)    -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity        -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean          1.4843771 0.15511117   9.569762 4.022500e-20
aridity:p_mean  0.1048449 0.07198145   1.456555 1.458304e-01

Making Predictions

lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)
[1] 135  61

Model Evaluation: statistical and visual

metrics(lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

Switch it up!

library(baguette)
# Random forest regression spec; impurity-based variable importance is
# recorded at fit time so vip() can be used later.
# NOTE(review): this uses the "ranger" engine, so the ranger package must
# be installed; library(baguette) is not required for this chunk.
rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

# Same recipe as the linear workflow, swapped onto the random forest model
rf_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(rf_model) %>%
  # Fit the model
  fit(data = camels_train) 

Predictions

rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135  60

Model Evaluation: statistical and visual

metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.592
2 rsq     standard       0.736
3 mae     standard       0.367
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

A workflowset approach

# Evaluate both model specs over the same CV folds in one workflow set
wf <- workflow_set(list(rec), list(lm_model, rf_model)) |>
  workflow_map('fit_resamples', resamples = camels_cv)

# Visual comparison of the resampling metrics
autoplot(wf)

# Rank the candidate workflows by R-squared
rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     1
2 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0249    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.769  0.0261    10 recipe       rand…     2

Q2: Make 2 maps of the sites, coloring the points by the aridity and p_mean columns. Add clear labels, titles, and a color scale that makes sense for each parameter. Ensure these render as a single image with your choice of facet_*, patchwork, or ggpubr.

# Map of gauge sites colored by aridity
p1 <- ggplot(camels, aes(gauge_lon, gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = aridity)) +
  scale_color_gradient(low = "hotpink", high = "purple") +
  labs(x = "Longitude", y = "Latitude",
       title = "Patterns of Aridity Across the United States") +
  ggthemes::theme_map()

# Map of gauge sites colored by mean daily precipitation
p2 <- ggplot(camels, aes(gauge_lon, gauge_lat)) +
  borders("state", colour = "gray50") +
  geom_point(aes(color = p_mean)) +
  scale_color_gradient(low = "limegreen", high = "skyblue") +
  labs(x = "Longitude", y = "Latitude",
       title = "U.S. Mean Daily Precipitation Patterns") +
  ggthemes::theme_map()

# Render the two maps side by side as a single figure
ggarrange(p1, p2, ncol = 2)

Check if significant correlation between these variables

camels |> 
  select(aridity, p_mean, q_mean) |> 
  drop_na() |> 
  cor()
           aridity     p_mean     q_mean
aridity  1.0000000 -0.7550090 -0.5817771
p_mean  -0.7550090  1.0000000  0.8865757
q_mean  -0.5817771  0.8865757  1.0000000

Visual EDA

# Create a scatter plot of aridity vs rainfall
ggplot(camels, aes(x = aridity, y = p_mean)) +
  # Add points colored by mean flow
  geom_point(aes(color = q_mean)) +
  # Add a linear regression line
  geom_smooth(method = "lm", color = "red", linetype = 2) +
  # Apply the viridis color scale
  scale_color_viridis_c() +
  # Add a title, axis labels, and theme (w/ legend on the bottom)
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo "Runnoff"
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

To test a transformation, we can log transform the x and y axes using the scale_x_log10() and scale_y_log10() functions:

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  scale_color_viridis_c() +
  # Apply log transformations to the x and y axes
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom") +
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo "Runnoff"
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

Visualize how a log transform may benefit the q_mean data

ggplot(camels, aes(x = aridity, y = p_mean)) +
  geom_point(aes(color = q_mean)) +
  geom_smooth(method = "lm") +
  # Apply a log transformation to the color scale
  scale_color_viridis_c(trans = "log") +
  scale_x_log10() +
  scale_y_log10() +
  theme_linedraw() +
  theme(legend.position = "bottom",
        # Expand the legend width ...
        legend.key.width = unit(2.5, "cm"),
        legend.key.height = unit(.5, "cm")) +
  labs(title = "Aridity vs Rainfall vs Runoff",  # fixed typo "Runnoff"
       x = "Aridity",
       y = "Rainfall",
       color = "Mean Flow")
`geom_smooth()` using formula = 'y ~ x'

Model Building

set.seed(123)
# Bad form to perform simple transformations on the outcome variable within a 
# recipe. So, we'll do it here.
camels <- camels |> 
  mutate(logQmean = log(q_mean))

# Generate the split
camels_split <- initial_split(camels, prop = 0.8)
camels_train <- training(camels_split)
camels_test  <- testing(camels_split)

camels_cv <- vfold_cv(camels_train, v = 10)

Preprocessor: recipe

# Create a recipe to preprocess the data
rec <-  recipe(logQmean ~ aridity + p_mean, data = camels_train) %>%
  # Log transform the predictor variables (aridity and p_mean)
  step_log(all_predictors()) %>%
  # Add an interaction term between aridity and p_mean
  step_interact(terms = ~ aridity:p_mean) |> 
  # Drop any rows with missing values in the pred
  step_naomit(all_predictors(), all_outcomes())

Naive base lm approach:

# Prepare the data
baked_data <- prep(rec, camels_train) |> 
  bake(new_data = NULL)

# Interaction with lm
#  Base lm sets interaction terms with the * symbol
lm_base <- lm(logQmean ~ aridity * p_mean, data = baked_data)
summary(lm_base)

Call:
lm(formula = logQmean ~ aridity * p_mean, data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)    -1.77586    0.16365 -10.852  < 2e-16 ***
aridity        -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean          1.48438    0.15511   9.570  < 2e-16 ***
aridity:p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16
# Sanity Interaction term from recipe ... these should be equal!!
summary(lm(logQmean ~ aridity + p_mean + aridity_x_p_mean, data = baked_data))

Call:
lm(formula = logQmean ~ aridity + p_mean + aridity_x_p_mean, 
    data = baked_data)

Residuals:
     Min       1Q   Median       3Q      Max 
-2.91162 -0.21601 -0.00716  0.21230  2.85706 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)      -1.77586    0.16365 -10.852  < 2e-16 ***
aridity          -0.88397    0.16145  -5.475 6.75e-08 ***
p_mean            1.48438    0.15511   9.570  < 2e-16 ***
aridity_x_p_mean  0.10484    0.07198   1.457    0.146    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.5696 on 531 degrees of freedom
Multiple R-squared:  0.7697,    Adjusted R-squared:  0.7684 
F-statistic: 591.6 on 3 and 531 DF,  p-value: < 2.2e-16

Correct version: prep -> bake -> predict

test_data <-  bake(prep(rec), new_data = camels_test)
test_data$lm_pred <- predict(lm_base, newdata = test_data)

Model Evaluation: statistical and visual

 metrics(test_data, truth = logQmean, estimate = lm_pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(test_data, aes(x = logQmean, y = lm_pred, colour = aridity)) +
  # Apply a gradient color scale
  scale_color_gradient2(low = "brown", mid = "orange", high = "darkgreen") +
  geom_point() +
  geom_abline(linetype = 2) +
  theme_linedraw() + 
  labs(title = "Linear Model: Observed vs Predicted",
       x = "Observed Log Mean Flow",
       y = "Predicted Log Mean Flow",
       color = "Aridity")

Using a workflow instead

# Define model
lm_model <- linear_reg() %>%
  # define the engine
  set_engine("lm") %>%
  # define the mode
  set_mode("regression")

# Instantiate a workflow ...
lm_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(lm_model) %>%
  # Fit the model to the training data
  fit(data = camels_train) 

# Extract the model coefficients from the workflow
summary(extract_fit_engine(lm_wf))$coefficients
                   Estimate Std. Error    t value     Pr(>|t|)
(Intercept)      -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity          -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean            1.4843771 0.15511117   9.569762 4.022500e-20
aridity_x_p_mean  0.1048449 0.07198145   1.456555 1.458304e-01
# From the base implementation
summary(lm_base)$coefficients
                 Estimate Std. Error    t value     Pr(>|t|)
(Intercept)    -1.7758557 0.16364755 -10.851710 6.463654e-25
aridity        -0.8839738 0.16144589  -5.475357 6.745512e-08
p_mean          1.4843771 0.15511117   9.569762 4.022500e-20
aridity:p_mean  0.1048449 0.07198145   1.456555 1.458304e-01

Making Predictions

#
lm_data <- augment(lm_wf, new_data = camels_test)
dim(lm_data)
[1] 135  61

Model Evaluation: statistical and visual

metrics(lm_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.583
2 rsq     standard       0.742
3 mae     standard       0.390
ggplot(lm_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

Switch it up!

library(baguette)
rf_model <- rand_forest() %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("regression")

rf_wf <- workflow() %>%
  # Add the recipe
  add_recipe(rec) %>%
  # Add the model
  add_model(rf_model) %>%
  # Fit the model
  fit(data = camels_train) 

Predictions

rf_data <- augment(rf_wf, new_data = camels_test)
dim(rf_data)
[1] 135  60

Model Evaluation: statistical and visual

metrics(rf_data, truth = logQmean, estimate = .pred)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.592
2 rsq     standard       0.736
3 mae     standard       0.367
ggplot(rf_data, aes(x = logQmean, y = .pred, colour = aridity)) +
  scale_color_viridis_c() +
  geom_point() +
  geom_abline() +
  theme_linedraw()

A workflowset approach

# Evaluate both model specs over the same CV folds in one workflow set
wf <- workflow_set(list(rec), list(lm_model, rf_model)) |>
  workflow_map('fit_resamples', resamples = camels_cv)

# Visual comparison of the resampling metrics
autoplot(wf)

# Rank the candidate workflows by R-squared
rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 4 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_linear_reg Prepro… rmse    0.569  0.0260    10 recipe       line…     1
2 recipe_linear_reg Prepro… rsq     0.770  0.0223    10 recipe       line…     1
3 recipe_rand_fore… Prepro… rmse    0.565  0.0249    10 recipe       rand…     2
4 recipe_rand_fore… Prepro… rsq     0.769  0.0261    10 recipe       rand…     2

Question 3: Your Turn!

Q3: Build an xgboost (engine) regression (mode) model using boost_tree

# XGBoost regression specification with 1000 boosting iterations
xgBoost_model <- boost_tree(mode = "regression", trees = 1000) |>
  set_engine('xgboost')

Q3: Build a neural network model using the nnet engine from the baguette package using the bag_mlp function

# Bagged multilayer-perceptron (neural network) regression spec, nnet engine
NeuralNet_Model <- set_engine(bag_mlp(mode = "regression"), 'nnet')

Q3: Add this to the above workflow

# Fit the XGBoost spec with the shared recipe
xgbm_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(xgBoost_model) |>
  fit(data = camels_train)

# Fit the bagged neural network spec with the same recipe
NeuralNet_Model_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(NeuralNet_Model) |>
  fit(data = camels_train)

Q3: Evaluate the model and compare it to the linear and random forest models: The boosted tree and neural network models produced very similar results, while the linear regression and random forest models showed slight discrepancies. I would proceed with the boosted tree and neural network models, as their predictions align closely with the 1:1 line and their evaluation metrics are strongest.

# Clean the training data: coerce the modeled columns to numeric and drop
# rows with non-finite or extreme values before fitting.
camels_train_clean <- camels_train |>
  mutate(
    p_mean = as.numeric(p_mean),
    aridity = as.numeric(aridity),
    high_prec_dur = as.numeric(high_prec_dur),
    logQmean = as.numeric(logQmean)
  ) |>
  mutate(across(where(is.numeric),
                \(v) ifelse(is.finite(v) & !is.na(v) & abs(v) < 1e10, v, NA))) |>
  drop_na()

# NOTE(review): this overwrites the earlier `rec`; rerunning earlier chunks
# after this point would pick up this three-predictor recipe instead.
# (The original chunk defined `rec`, `xgbm_model`, and `xgbm_wf` twice,
# immediately discarding the first definitions; the dead code is removed.)
rec <- recipe(logQmean ~ p_mean + aridity + high_prec_dur,
              data = camels_train_clean) |>
  step_log(all_predictors()) |>
  step_normalize(all_predictors())

# XGBoost regression specification
xgbm_model <- boost_tree() |>
  set_engine("xgboost") |>
  set_mode("regression")

# Fit both models on the cleaned training data
xgbm_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(xgbm_model) |>
  fit(data = camels_train_clean)

NeuralNet_Model_wf <- workflow() |>
  add_recipe(rec) |>
  add_model(NeuralNet_Model) |>
  fit(data = camels_train_clean)

# In-sample (training-set) predictions for both fitted workflows
xgb_predictions <- augment(xgbm_wf, new_data = camels_train_clean)
nn_predictions <- augment(NeuralNet_Model_wf, new_data = camels_train_clean)

library(yardstick)

xgb_metrics <- metrics(xgb_predictions, truth = logQmean, estimate = .pred)
print(xgb_metrics)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard      0.136 
2 rsq     standard      0.990 
3 mae     standard      0.0982
nn_metrics <- metrics(nn_predictions, truth = logQmean, estimate = .pred)
print(nn_metrics)
# A tibble: 3 × 3
  .metric .estimator .estimate
  <chr>   <chr>          <dbl>
1 rmse    standard       0.443
2 rsq     standard       0.876
3 mae     standard       0.288
# Observed vs predicted logQmean for the XGBoost fit (training data)
ggplot(xgb_predictions, aes(x = logQmean, y = .pred, colour = aridity)) +
  geom_point() +
  geom_abline() +
  scale_color_viridis_c() +
  theme_linedraw() +
  labs(title = "Observed vs Predicted logQmean (XGBoost Model)",
       x = "Observed logQmean",
       y = "Predicted logQmean",
       colour = "Aridity")

# Observed vs predicted logQmean for the neural network fit (training data)
ggplot(nn_predictions, aes(x = logQmean, y = .pred, colour = aridity)) +
  geom_point() +
  geom_abline() +
  scale_color_viridis_c() +
  theme_linedraw() +
  labs(title = "Observed vs Predicted logQmean (Neural Network Model)",
       x = "Observed logQmean",
       y = "Predicted logQmean",
       colour = "Aridity")

autoplot(wf)

Q3: Which of the 4 models would you move forward with? I would move forward with the boosted tree and neural network models because their results align closely with the 1:1 line and their evaluation metrics indicate accurate and reliable predictions.

Question 4: Build your own

Q4a: Data Prep / Data Splitting

set.seed(1234)
# 75/25 split for Question 4
resample_split <- initial_split(camels, prop = 0.75)
train_camels <- training(resample_split)
glimpse(train_camels)
Rows: 503
Columns: 59
$ gauge_id             <chr> "04224775", "02038850", "12447390", "14138800", "…
$ p_mean               <dbl> 2.914367, 3.149615, 2.559076, 7.729751, 3.617372,…
$ pet_mean             <dbl> 2.244688, 2.889448, 2.188394, 2.094345, 2.804693,…
$ p_seasonality        <dbl> 0.22762612, 0.05776050, -0.59836516, -0.81475942,…
$ frac_snow            <dbl> 0.204650191, 0.065522598, 0.777311498, 0.31726621…
$ aridity              <dbl> 0.7702146, 0.9173972, 0.8551500, 0.2709460, 0.775…
$ high_prec_freq       <dbl> 17.20, 23.75, 18.95, 12.55, 23.75, 21.75, 23.95, …
$ high_prec_dur        <dbl> 1.142857, 1.193467, 1.572614, 1.512048, 1.250000,…
$ high_prec_timing     <chr> "jja", "son", "djf", "djf", "mam", "mam", "jja", …
$ low_prec_freq        <dbl> 217.75, 267.65, 253.70, 200.85, 269.70, 260.55, 2…
$ low_prec_dur         <dbl> 3.151230, 4.979535, 5.720406, 5.641854, 5.272727,…
$ low_prec_timing      <chr> "jja", "son", "jja", "jja", "son", "son", "son", …
$ geol_1st_class       <chr> "Siliciclastic sedimentary rocks", "Siliciclastic…
$ glim_1st_class_frac  <dbl> 1.0000000, 0.7783589, 1.0000000, 0.7408656, 0.656…
$ geol_2nd_class       <chr> NA, "Metamorphics", NA, "Unconsolidated sediments…
$ glim_2nd_class_frac  <dbl> 0.00000000, 0.15483860, 0.00000000, 0.25913437, 0…
$ carbonate_rocks_frac <dbl> 0.000000000, 0.000000000, 0.000000000, 0.00000000…
$ geol_porostiy        <dbl> 0.1338, 0.0996, 0.0100, 0.1392, 0.1580, 0.0128, 0…
$ geol_permeability    <dbl> -16.2436, -15.8882, -14.1000, -12.0854, -14.0789,…
$ soil_depth_pelletier <dbl> 4.0527704, 1.0270270, 0.4727273, 0.8974359, 1.048…
$ soil_depth_statsgo   <dbl> 1.1054176, 1.5000000, 1.5000000, 0.8373043, 1.424…
$ soil_porosity        <dbl> 0.4642713, 0.4560548, 0.4344597, 0.4431248, 0.450…
$ soil_conductivity    <dbl> 0.9744709, 0.5862081, 1.8269572, 1.5541585, 0.892…
$ max_water_content    <dbl> 0.4783560, 0.7018239, 0.6421171, 0.3598810, 0.510…
$ sand_frac            <dbl> 22.77199, 22.18412, 45.32336, 39.28517, 22.49397,…
$ silt_frac            <dbl> 56.43728, 30.80408, 37.98377, 44.37471, 31.19531,…
$ clay_frac            <dbl> 16.719066, 47.145138, 16.705685, 16.460240, 24.23…
$ water_frac           <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.000…
$ organic_frac         <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000,…
$ other_frac           <dbl> 4.0893952, 0.0000000, 0.0000000, 0.0000000, 22.07…
$ gauge_lat            <dbl> 42.53562, 37.41542, 48.82292, 45.45262, 35.98318,…
$ gauge_lon            <dbl> -77.70416, -78.63584, -120.14592, -121.89147, -92…
$ elev_mean            <dbl> 519.45, 176.41, 1700.98, 821.62, 459.08, 387.44, …
$ slope_mean           <dbl> 36.93333, 9.80131, 172.58264, 142.23871, 41.94700…
$ area_gages2          <dbl> 231.84, 22.00, 58.10, 21.29, 2149.36, 240.77, 144…
$ area_geospa_fabric   <dbl> 232.59, 23.80, 58.40, 21.83, 2147.73, 241.31, 145…
$ frac_forest          <dbl> 0.8681, 0.9300, 0.9643, 1.0000, 0.8844, 0.9407, 0…
$ lai_max              <dbl> 4.3014541, 5.3991196, 1.1999529, 4.0763384, 4.958…
$ lai_diff             <dbl> 3.7519191, 4.3350245, 0.7890515, 2.3938173, 4.398…
$ gvf_max              <dbl> 0.8525770, 0.8844151, 0.4894325, 0.8705525, 0.870…
$ gvf_diff             <dbl> 0.4975784, 0.3792956, 0.1804294, 0.2019336, 0.491…
$ dom_land_cover_frac  <dbl> 0.5212560, 0.6610673, 0.8600801, 1.0000000, 0.895…
$ dom_land_cover       <chr> "    Deciduous Broadleaf Forest", "    Mixed Fore…
$ root_depth_50        <dbl> 0.1852126, 0.2296640, 0.1630040, 0.1700000, 0.188…
$ root_depth_99        <dbl> 1.760628, 2.264427, 1.758024, 1.800000, 1.947805,…
$ q_mean               <dbl> 1.12964683, 0.88795895, 1.36554402, 6.51073552, 1…
$ runoff_ratio         <dbl> 0.387613119, 0.281926158, 0.533608238, 0.84229565…
$ slope_fdc            <dbl> 1.6984974, 1.2903515, 0.8783967, 1.8185192, 1.670…
$ baseflow_index       <dbl> 0.506242202, 0.580267146, 0.534019978, 0.45786958…
$ stream_elas          <dbl> 0.02287331, 2.18611483, 0.86605745, 1.09287074, 2…
$ q5                   <dbl> 0.1055286208, 0.0814042409, 0.1221182286, 0.25281…
$ q95                  <dbl> 3.69350173, 2.55778352, 7.10812310, 23.55791390, …
$ high_q_freq          <dbl> 11.250000, 6.650000, 48.800000, 12.800000, 24.000…
$ high_q_dur           <dbl> 1.906780, 1.445652, 26.378378, 1.910448, 3.380282…
$ low_q_freq           <dbl> 79.15000, 43.15000, 146.50000, 107.65000, 152.800…
$ low_q_dur            <dbl> 10.993056, 12.507246, 33.295455, 24.191011, 32.86…
$ zero_q_freq          <dbl> 0.0000000000, 0.0000000000, 0.0000000000, 0.00000…
$ hfd_mean             <dbl> 164.4000, 165.3500, 241.3000, 135.8500, 167.8000,…
$ logQmean             <dbl> 0.12190505, -0.11882976, 0.31155290, 1.87345243, …
# Held-out 25% test set for Question 4
test_camels <- testing(resample_split)
glimpse(test_camels)
Rows: 168
Columns: 59
$ gauge_id             <chr> "01030500", "01047000", "01052500", "01055000", "…
$ p_mean               <dbl> 3.274405, 3.323146, 3.730858, 3.494183, 3.570500,…
$ pet_mean             <dbl> 2.043594, 2.090024, 2.096423, 2.093235, 2.132744,…
$ p_seasonality        <dbl> 0.047358189, 0.147775614, 0.152096803, 0.16722904…
$ frac_snow            <dbl> 0.27701840, 0.28011813, 0.35269825, 0.30604885, 0…
$ aridity              <dbl> 0.6241114, 0.6289293, 0.5619145, 0.5990628, 0.597…
$ high_prec_freq       <dbl> 17.15, 20.10, 13.50, 19.15, 20.35, 15.50, 18.65, …
$ high_prec_dur        <dbl> 1.207746, 1.165217, 1.129707, 1.167683, 1.169540,…
$ high_prec_timing     <chr> "son", "son", "jja", "son", "son", "son", "jja", …
$ low_prec_freq        <dbl> 215.60, 235.90, 193.50, 224.85, 239.30, 204.35, 2…
$ low_prec_dur         <dbl> 3.514262, 3.691706, 2.896707, 3.378663, 3.747847,…
$ low_prec_timing      <chr> "djf", "djf", "mam", "mam", "djf", "mam", "mam", …
$ geol_1st_class       <chr> "Siliciclastic sedimentary rocks", "Metamorphics"…
$ glim_1st_class_frac  <dbl> 0.5733054, 0.3084880, 0.4974583, 0.6808434, 0.451…
$ geol_2nd_class       <chr> "Metamorphics", "Acid plutonic rocks", "Metamorph…
$ glim_2nd_class_frac  <dbl> 0.2870100056, 0.2886125463, 0.3740616968, 0.16836…
$ carbonate_rocks_frac <dbl> 0.052140094, 0.000000000, 0.000000000, 0.00000000…
$ geol_porostiy        <dbl> 0.1178, 0.0522, 0.0711, 0.1455, 0.0251, 0.0100, 0…
$ geol_permeability    <dbl> -14.4918, -14.4819, -15.1658, -14.3578, -13.9903,…
$ soil_depth_pelletier <dbl> 19.0114144, 5.3596549, 1.3013493, 2.1270588, 3.75…
$ soil_depth_statsgo   <dbl> 1.4613632, 1.3927785, 1.4948074, 1.3696566, 1.500…
$ soil_porosity        <dbl> 0.4590910, 0.4227490, 0.4523257, 0.4240259, 0.404…
$ soil_conductivity    <dbl> 1.289807, 2.615154, 1.262995, 2.550001, 4.030392,…
$ max_water_content    <dbl> 0.6530198, 0.5611808, 0.6155380, 0.5470206, 0.617…
$ sand_frac            <dbl> 32.23546, 55.16313, 30.55767, 53.86201, 69.00674,…
$ silt_frac            <dbl> 51.77918, 34.18544, 52.61465, 34.73449, 22.94329,…
$ clay_frac            <dbl> 14.776824, 10.303622, 11.143326, 10.346323, 7.817…
$ water_frac           <dbl> 1.63434486, 0.00000000, 0.00000000, 0.00000000, 0…
$ organic_frac         <dbl> 1.330278, 0.000000, 0.000000, 0.000000, 0.000000,…
$ other_frac           <dbl> 0.0220161, 0.1478672, 5.6755267, 0.7780162, 0.000…
$ gauge_lat            <dbl> 45.50097, 44.86920, 44.87739, 44.64275, 44.30399,…
$ gauge_lon            <dbl> -68.30596, -69.95510, -71.05749, -70.58878, -70.5…
$ elev_mean            <dbl> 143.80, 310.38, 615.70, 423.12, 215.59, 784.85, 3…
$ slope_mean           <dbl> 12.79195, 49.92122, 60.05183, 72.81304, 32.68445,…
$ area_gages2          <dbl> 3676.17, 909.10, 383.82, 250.64, 190.92, 228.55, …
$ area_geospa_fabric   <dbl> 3676.09, 904.94, 396.10, 251.16, 197.70, 238.32, …
$ frac_forest          <dbl> 0.8782, 0.9906, 1.0000, 0.9916, 0.9415, 0.9972, 0…
$ lai_max              <dbl> 4.685200, 5.086811, 4.800830, 5.030033, 5.362949,…
$ lai_diff             <dbl> 3.665543, 4.300978, 4.124313, 4.319226, 4.587077,…
$ gvf_max              <dbl> 0.8585020, 0.8913828, 0.8800341, 0.8861635, 0.905…
$ gvf_diff             <dbl> 0.3513934, 0.4454732, 0.4773279, 0.4714315, 0.458…
$ dom_land_cover_frac  <dbl> 0.9752580, 0.8504500, 0.5935884, 0.5242488, 0.532…
$ dom_land_cover       <chr> "    Mixed Forests", "    Mixed Forests", "    Mi…
$ root_depth_50        <dbl> NA, 0.2410270, 0.2256153, 0.2214549, 0.2091862, 0…
$ root_depth_99        <dbl> NA, 2.340180, 2.237435, 2.209700, 2.073141, 2.364…
$ q_mean               <dbl> 1.8201075, 2.1828696, 2.4051046, 2.2798975, 1.823…
$ runoff_ratio         <dbl> 0.5558590, 0.6568684, 0.6446518, 0.6524836, 0.510…
$ slope_fdc            <dbl> 1.871110, 1.415939, 1.301062, 1.349312, 1.533232,…
$ baseflow_index       <dbl> 0.5084407, 0.4734649, 0.4596999, 0.4381317, 0.474…
$ stream_elas          <dbl> 1.3775052, 1.5102375, 1.0255555, 1.3665871, 1.280…
$ q5                   <dbl> 0.10714920, 0.19645805, 0.30596536, 0.18546495, 0…
$ q95                  <dbl> 6.854887, 8.095148, 8.669019, 8.441584, 6.866097,…
$ high_q_freq          <dbl> 12.25, 14.95, 14.10, 16.25, 13.85, 8.25, 4.15, 6.…
$ high_q_dur           <dbl> 7.205882, 2.577586, 2.517857, 2.056962, 2.429825,…
$ low_q_freq           <dbl> 89.25, 71.55, 58.90, 83.60, 82.95, 17.30, 35.15, …
$ low_q_dur            <dbl> 19.402174, 12.776786, 7.316770, 8.941176, 13.8250…
$ zero_q_freq          <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0…
$ hfd_mean             <dbl> 184.90, 184.80, 197.20, 186.05, 178.05, 189.15, 1…
$ logQmean             <dbl> 0.598895589, 0.780640330, 0.877593397, 0.82413047…
# Build 10-fold cross-validation resamples from the cleaned training data;
# every candidate model below is evaluated on these same folds.
cv_folds <- train_camels |>
  vfold_cv(v = 10)

cv_folds
#  10-fold cross-validation 
# A tibble: 10 × 2
   splits           id    
   <list>           <chr> 
 1 <split [452/51]> Fold01
 2 <split [452/51]> Fold02
 3 <split [452/51]> Fold03
 4 <split [453/50]> Fold04
 5 <split [453/50]> Fold05
 6 <split [453/50]> Fold06
 7 <split [453/50]> Fold07
 8 <split [453/50]> Fold08
 9 <split [453/50]> Fold09
10 <split [453/50]> Fold10

Q4b: Recipe

formula <- logQmean ~ p_mean + aridity + high_prec_dur

Describe in words why you are choosing the formula you are. Consult the downloaded PDF for the data to help you make this decision: I chose the predictor variables p_mean, aridity, and high_prec_dur for the formula because they are likely key factors affecting mean daily discharge. Precipitation contributes water to the system, directly influencing discharge. Aridity, which indicates the dryness of the environment, is expected to result in lower logQmean in more arid areas. Additionally, I believe that high_prec_dur will be positively correlated with logQmean, as an increase in high precipitation events tends to raise the mean daily discharge.

# Drop rows with missing values so the recipe, CV folds, and fitted models
# all see complete cases.
train_camels <- na.omit(train_camels)

# Preprocessing recipe: logQmean modeled on p_mean, aridity, and
# high_prec_dur. step_naomit() guards against NAs introduced at bake time;
# step_zv() removes zero-variance predictors.
# (A previous version also defined a step_log() recipe here, but it was
# immediately overwritten and never used — removed as dead code.)
rec <- recipe(logQmean ~ p_mean + aridity + high_prec_dur, data = train_camels) %>%
  step_naomit(all_predictors(), all_outcomes()) %>%
  step_zv(all_predictors())

# Sanity check: prep/bake the recipe and confirm no NA values remain.
rec_prep <- prep(rec, training = train_camels)
baked_data <- bake(rec_prep, new_data = NULL)
sum(is.na(baked_data))
[1] 0
sum(is.infinite(as.matrix(baked_data))) 
[1] 0

Q4c: Define 3 models ###Define a random forest model using the rand_forest function and set the engine to ranger and the mode to regression

# Random forest regression spec backed by the ranger engine.
Q4_rf_model <- rand_forest() |>
  set_mode("regression") |>
  set_engine("ranger")

Define two other models of your choice

# Two additional candidate models: ordinary least squares and
# gradient-boosted trees (xgboost), both in regression mode.
Q4_lm_model <- linear_reg() |>
  set_mode("regression") |>
  set_engine("lm")

Q4_gbm_model <- boost_tree() |>
  set_mode("regression") |>
  set_engine("xgboost")

Q4d: workflow set

# Pair each model spec with the shared preprocessing recipe, then
# resample every workflow on the same 10 CV folds.
Q4_rf_wf <- workflow() |>
  add_model(Q4_rf_model) |>
  add_recipe(rec)

Q4_lm_wf <- workflow() |>
  add_model(Q4_lm_model) |>
  add_recipe(rec)

Q4_gbm_wf <- workflow() |>
  add_model(Q4_gbm_model) |>
  add_recipe(rec)

rf_results  <- Q4_rf_wf  |> fit_resamples(resamples = cv_folds)
lm_results  <- Q4_lm_wf  |> fit_resamples(resamples = cv_folds)
gbm_results <- Q4_gbm_wf |> fit_resamples(resamples = cv_folds)

Q4e: Evaluation

# Compare all three models on identical resamples via a workflow set.
# Fix: the resamples object created above is `cv_folds`; the previous
# `camels_cv` name does not exist in this document and would error.
wf <- workflow_set(list(rec), list(Q4_rf_model, Q4_lm_model, Q4_gbm_model)) %>%
  workflow_map("fit_resamples", resamples = cv_folds)

autoplot(wf)

# Rank workflows by cross-validated R-squared (best config per workflow).
rank_results(wf, rank_metric = "rsq", select_best = TRUE)
# A tibble: 6 × 9
  wflow_id          .config .metric  mean std_err     n preprocessor model  rank
  <chr>             <chr>   <chr>   <dbl>   <dbl> <int> <chr>        <chr> <int>
1 recipe_rand_fore… Prepro… rmse    0.533  0.0292    10 recipe       rand…     1
2 recipe_rand_fore… Prepro… rsq     0.792  0.0262    10 recipe       rand…     1
3 recipe_boost_tree Prepro… rmse    0.549  0.0281    10 recipe       boos…     2
4 recipe_boost_tree Prepro… rsq     0.780  0.0248    10 recipe       boos…     2
5 recipe_linear_reg Prepro… rmse    0.621  0.0318    10 recipe       line…     3
6 recipe_linear_reg Prepro… rsq     0.731  0.0241    10 recipe       line…     3

Describe what model you think is best and why! Based on the evaluation results, the Random Forest model is the best choice: it ranks first on both metrics, with the highest cross-validated R-squared (0.792) and the lowest RMSE (0.533), slightly ahead of Gradient Boosting (R-squared 0.780, RMSE 0.549). Both ensemble methods outperform Linear Regression (R-squared 0.731), which is expected since they can capture complex, non-linear relationships between the predictors and logQmean that a linear model cannot.

Q4f: Extract and Evaluate

# Refit the winning model (random forest) on the full training set.
final_workflow <- workflow() |>
  add_model(Q4_rf_model) |>
  add_recipe(rec) |>
  fit(data = train_camels)

# Attach predictions to the hold-out set and plot observed vs. predicted.
# NOTE(review): `camels_test` is assumed to be the test split created
# earlier in the document — confirm the name matches.
final_workflow_data <- augment(final_workflow, new_data = camels_test)

ggplot(final_workflow_data) +
  geom_point(aes(x = .pred, y = logQmean, colour = logQmean)) +
  # Dashed 1:1 line: points on it are perfectly predicted.
  geom_abline(slope = 1, intercept = 0, color = "darkred", linetype = "dashed") +
  scale_color_viridis_c() +
  labs(title = "Observed vs. Predicted Values",
       x = "Predicted logQmean",
       y = "Observed logQmean")

Describe what you think of the results! The results are quite accurate, with the observed versus predicted logQmean values closely following the 1:1 line, indicating strong predictive performance. This suggests that the model is effectively capturing the relationship between the selected predictors and logQmean.